Impact of changing some key XGB features on the learning curves
Some notes from XGB documentation
Results
Details below
Default max_depth of 6 seems to be doing fine. Though 7 might be better
gamma default is 0 (not plotted) but higher gamma gave better result
subsample default of 1 looks ok
colsample_bytree default of 1 is not good. It depends on the number of features and has to be tuned
L1 reg parameter alpha |weight| default of 0 seems ok
L2 reg parameter lambda \(weight^{2}\) default of 1 is ok
though subsequent run gave somewhat different results
# Learning-curve run: depth-4 trees with the manually tuned regularisation
# settings (gamma, min_child_weight, alpha/lambda) from the notes below.
xgb_model <- xgb.train(
  params = list(
    max_depth        = 4,
    gamma            = 3.6674,
    min_child_weight = 500,
    subsample        = 0.9,
    colsample_bytree = 0.1,
    alpha            = 0.1,
    lambda           = 0.9
  ),
  data = my_train,
  watchlist = watchlist,
  nrounds = 1000,
  eta = 0.2,                 # forwarded via ... and merged into params by xgb.train
  early_stopping_rounds = 10,
  print_every_n = 50,
  verbose = TRUE
)
## [1] train-rmse:5.878344 test-rmse:5.876797
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
##
## [51] train-rmse:0.999144 test-rmse:0.995726
## [101] train-rmse:0.986275 test-rmse:0.984546
## [151] train-rmse:0.982668 test-rmse:0.981939
## [201] train-rmse:0.980920 test-rmse:0.980867
## [251] train-rmse:0.979738 test-rmse:0.980467
## [301] train-rmse:0.978951 test-rmse:0.980212
## [351] train-rmse:0.978281 test-rmse:0.979952
## [401] train-rmse:0.977703 test-rmse:0.979739
## [451] train-rmse:0.977038 test-rmse:0.979580
## Stopping. Best iteration:
## [446] train-rmse:0.977106 test-rmse:0.979570
# Learning curve: melt the evaluation log to long format so train and test
# RMSE become one line each, then zoom the y axis to the interesting range.
eval_long <- melt(xgb_model$evaluation_log,
                  measure.vars = c("train_rmse", "test_rmse"))
ggplot(eval_long) +
  geom_line(aes(x = iter, y = value, color = variable)) +
  scale_y_continuous(limits = c(0.96, 1))  # out-of-range rows are dropped (warning expected)
## Warning: Removed 87 rows containing missing values (geom_path).
# Per-feature importance (Gain / Cover / Frequency) from the trained booster.
importance <- xgb.importance(model = xgb_model)
# Top 10 rows (data.table row indexing; rows are sorted by Gain).
importance[1:10]
## Feature Gain Cover Frequency
## 1: no_of_rooms 0.19133811 0.031373968 0.030150754
## 2: length_of_stay 0.15168774 0.031859274 0.029480737
## 3: numberofadults.2 0.05022235 0.006674966 0.009380235
## 4: roomnights 0.04997214 0.033379472 0.040871022
## 5: persontravellingid.47 0.04788985 0.013196053 0.008710218
## 6: total_pax.2 0.03133324 0.019817782 0.015410385
## 7: total_pax_outlier2.1 0.02522512 0.008912954 0.012060302
## 8: resort_type_code.3 0.02252866 0.005097424 0.008375209
## 9: resort_id.Rst-18 0.01885279 0.006187992 0.006700168
## 10: lead_time 0.01824837 0.030451573 0.046566164
# Bar chart of the 30 most important features.
xgb.plot.importance(importance[1:30])
# Render one individual tree of the ensemble.
# NOTE(review): confirm whether `trees` is 0- or 1-indexed in the installed
# xgboost version before relying on "tree 10" being the intended one.
xgb.plot.tree(model=xgb_model,trees = 10)
# export_graph(graph = xgb.plot.tree(model=xgb_model,trees = i,render=FALSE)
# ,file_name = here::here('300_output'
# ,'individual_trees'
# ,paste0('20190512_sampletree-depth4_no',i,'.png'))
# ,width=2000
# ,height=2000)
# Collapse the whole ensemble into one summarised tree, keeping the 5 most
# frequent features per node position; render = FALSE returns the graph object.
xgb.plot.multi.trees(model = xgb_model
,feature_names = xgb_model$feature_names
,features_keep = 5
,render = FALSE)
## Column 2 ['No'] of item 2 is missing in item 1. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names. use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for backwards compatibility. See news item 5 in v1.12.2 for options to control this message.
# Tree-depth diagnostics: distribution of leaves by depth, plus per-depth
# cover and weight summaries (the `which` argument picks the view).
xgb.ggplot.deepness(model=xgb_model)
xgb.ggplot.deepness(model=xgb_model, which = "max.depth")
xgb.ggplot.deepness(model=xgb_model, which = "med.depth")
xgb.ggplot.deepness(model=xgb_model, which = "med.weight")
# plot = FALSE returns the underlying per-leaf data.table instead of plotting.
temp <- xgb.plot.deepness(model=xgb_model, plot = FALSE)
dim(temp)
## [1] 3408 5
temp[Tree == 10]
## ID Tree Depth Cover Weight
## 1: 10-14 10 4 1468 0.2361592
## 2: 10-15 10 5 13074 0.1830244
## 3: 10-16 10 5 2951 0.2015547
## 4: 10-17 10 5 504 0.2105512
## 5: 10-18 10 5 629 0.2376156
## 6: 10-19 10 5 148106 0.1515135
## 7: 10-20 10 5 48595 0.1315591
## 8: 10-21 10 5 10752 0.1803118
## 9: 10-22 10 5 1993 0.2127287
## 10: 10-23 10 5 945 0.2151556
## 11: 10-24 10 5 503 0.1836387
## 12: 10-25 10 5 9841 0.1717489
## 13: 10-26 10 5 533 0.2154512
## 14: 10-27 10 5 3053 0.1987544
## 15: 10-28 10 5 2827 0.2123805
# SHAP dependence plots on the training matrix.
# NOTE(review): no `features`/`top_n` is given, so the function's defaults
# decide which feature(s) are plotted — confirm this matches the intent.
xgb.plot.shap(data = my_train
,model = xgb_model)
# p_load(devtools)
# install_github("AppliedDataSciencePartners/xgboostExplainer")
library(xgboostExplainer)
# Build per-tree, per-leaf contribution tables used by showWaterfall() below.
# base_score must match the value the model was trained with (xgboost default 0.5);
# trees_idx = NULL means all trees are included.
explainer <- buildExplainer(xgb_model
,my_train
,type="regression"
,base_score = 0.5
,trees_idx = NULL)
#* Threshold set to 0.01
# Waterfall chart of feature contributions for a single prediction
# (row 100 of the test set). Contributions with absolute impact below
# `threshold` are pooled into an "other impacts" bar.
showWaterfall(xgb_model
,explainer
,my_test
,cbind(my_test_4xgb_explainer,my_test_labels)
,100
,type = "regression"
,threshold = 0.01)
#pred.breakdown <- explainPredictions(xgb_model, explainer, my_test)
# Other XGB plots [TO BE REVIEWED]
#xgb.plot.multi.trees(model = xgb_model)
#xgb.plot.deepness(model=xgb_model)
#xgb.plot.shap()
# Best boosting round found by early stopping.
xgb_model$best_iteration
# Render a single tree; plot_height enlarges the canvas for deep trees.
xgb.plot.tree(model=xgb_model,trees = 75, plot_height = 2000)
# DiagrammeR + svg toolchain needed to export a rendered tree to disk.
p_load(DiagrammeR)
p_load(DiagrammeRsvg)
p_load(rsvg)
# NOTE(review): trees = 1236 assumes the model has that many trees — verify
# against xgb_model$niter before running.
export_graph(graph = xgb.plot.tree(model=xgb_model,trees = 1236,render=FALSE)
,file_name = here::here('230_src_RMD-and-output','tree.png'), width=1500, height=1900)
# Feature contributions [HOW CAN THIS BE USED]
#pred <- predict(xgb_model,newdata = my_test ,predcontrib = TRUE)
#Find linear combinations or correlated variables and remove from data
# Demo: detect and drop exact linear combinations among numeric columns.
# requireNamespace() probes availability without attaching; library() then
# errors loudly on failure (require() only returns FALSE, hiding problems).
if (!requireNamespace("caret", quietly = TRUE)) install.packages("caret")
library(caret)

# Duplicate a column (wt * 2) so at least one linear dependency exists.
mtcars$wt_new <- mtcars$wt * 2

# findLinearCombos() errors on NAs, so restrict to complete rows.
lincomb <- findLinearCombos(mtcars[complete.cases(mtcars),])
lincomb$linearCombos[[1]]                    # column indices of the first dependency
colnames(mtcars)[lincomb$linearCombos[[1]]]  # ... as column names

# Guard the drop: `mtcars[-integer(0)]` selects ZERO columns (negative
# indexing with an empty vector), so only drop when there is something to drop.
if (length(lincomb$remove) > 0) mtcars[-lincomb$remove] else mtcars
So you may leave all your features in and run a few iterations to see how important (or not) they are; the features that consistently sit at the bottom of the variable-importance chart can then be excluded from subsequent runs to improve computational performance.
In random forests, the optimal setting for feature subsampling is usually the square root of the number of features:
Unused features Moved here
Notes on manual parameter tuning, before the parameter plots
Increase min_child_weight * with min_child_weight = 3000 (approx 1% of nrow) + Stopping. Best iteration: [231] train-rmse:0.979333 test-rmse:0.976716 + also look at hardly any difference in train-test => less overfit ?
Fix min_child_weight = 3000
Fix colsample_bytree = 0.1 and Adjust gamma - minimum loss reduction required to make a split.
Optimal gamma : 2 to 4
Revert gamma to the optimal
min_child_weight Optimal anywhere between 50 to 500
Param Check : eta = 0.2,max_depth=5,gamma = 3.6674,min_child_weight = 500 ,subsample = 1,colsample_bytree = 0.1
test-rmse:0.974572
Improved
Fix alpha optimal around 0.5
Conclusion * On 300 features model of eta 0.2 and max depth of 5 * min_child_weight of 500 made big reduction in test-rmse and also train-vs-test rmse difference * colsample_bytree = 0.1 also reduced test-rmse * gamma optimal between 2 - 4 * alpha default 0 to 0.5 improved performance a bit * Reducing subsample worsens the performance * Reducing max_depth from 5 worsens the performance
max_depth = 6 Stopping. Best iteration: [285] train-rmse:0.975438 test-rmse:0.974898
glimpse(DT)

# Depth-10 run with booster/objective/metric stated explicitly.
# NOTE: "reg:linear" is the deprecated alias of "reg:squarederror" in recent
# xgboost releases; kept as-is so results stay comparable with earlier logs.
xgb_model <- xgb.train(
  params = list(
    booster          = "gbtree"
    ,objective        = "reg:linear"
    ,eval_metric      = "rmse"
    ,eta              = 0.2
    ,max_depth        = 10
    ,gamma            = 3.6674
    ,min_child_weight = 500
    ,subsample        = 1
    ,colsample_bytree = 0.1
    ,alpha            = 0.5
  )
  ,data = my_train
  ,watchlist = watchlist
  ,nrounds = 10000
  ,early_stopping_rounds = 5
  ,verbose = TRUE)
# Fix: dropped `prediction = TRUE` — that is an xgb.cv() argument, not
# xgb.train(); here it was silently swallowed into params and ignored.

xgb_model$evaluation_log   # per-round train/test RMSE
unlist(xgb_model$params)   # flat view of the parameters actually used
Early stopping of 10 gives Best iteration: [368] train-rmse:0.973267 test-rmse:0.972976
# Depth-7 rerun of the tuned configuration; early stopping on test RMSE.
xgb_model <- xgb.train(
  params = list(
    max_depth        = 7,
    gamma            = 3.6674,
    min_child_weight = 500,
    subsample        = 0.9,
    colsample_bytree = 0.1,
    alpha            = 0.1,
    lambda           = 0.9
  ),
  data = my_train,
  watchlist = watchlist,
  nrounds = 1000,
  eta = 0.2,                 # forwarded via ... and merged into params by xgb.train
  early_stopping_rounds = 10,
  print_every_n = 10,
  verbose = TRUE
)
# Plot learning curve: train vs test RMSE per boosting round.
curve_long <- melt(xgb_model$evaluation_log,
                   measure.vars = c("train_rmse", "test_rmse"))
ggplot(curve_long) +
  geom_line(aes(x = iter, y = value, color = variable)) +
  scale_y_continuous(limits = c(0.96, 1))

# Feature importance for this run: table and bar chart of the top 30.
importance <- xgb.importance(model = xgb_model)
importance[1:30]
xgb.plot.importance(importance[1:30])
* Tree for iteration no. 368
# Save the tree of the best iteration (round 368) to a PNG via DiagrammeR.
export_graph(graph = xgb.plot.tree(model=xgb_model,trees = 368,render=FALSE)
,file_name = here::here('300_output','20190512_tree-depth7-rmse0_9729.png'), width=2000, height=2000)
#' Add grouped aggregate columns to a data.table, by reference.
#'
#' For every key set in `keycols_list` and every function name in `FUNs`,
#' creates a column named "<keys joined by _>_<fun>" holding the aggregate
#' of `measure_col` within each group.
#'
#' @param DT data.table; modified in place via `:=`.
#' @param keycols_list list of character vectors, each one a grouping key set.
#' @param FUNs character vector of aggregate function names, e.g. c("sum", "mean").
#' @param measure_col character scalar: the column to aggregate.
#' @return DT, invisibly (it is already modified by reference).
fn_agg <- function(DT, keycols_list, FUNs, measure_col) {
  for (keys in keycols_list) {
    for (fun_name in FUNs) {
      # Fix: dropped the redundant eval() — fun_name is already a character.
      new_col_name <- paste0(paste0(unlist(keys), collapse = "_"), "_", fun_name)
      # get(fun_name) resolves the aggregate function by name inside DT's scope.
      DT[, (new_col_name) := lapply(.SD, get(fun_name)), by = keys, .SDcols = measure_col]
    }
  }
  invisible(DT)
}
#' Add ratio columns (first / second) for each column pair, by reference.
#'
#' The ratio is forced to 0 when the denominator is 0 or NA, avoiding
#' Inf/NaN in the derived feature.
#'
#' @param DT data.table; modified in place via `:=`.
#' @param col_pair list of length-2 character vectors: c(numerator, denominator).
#' @return DT, invisibly.
fn_ratio <- function(DT, col_pair) {
  for (pair in col_pair) {
    new_col_name <- paste0(paste0(unlist(pair), collapse = "_"), "_ratio")
    # Fix: removed leftover debug print() of the denominator name.
    DT[, (new_col_name) := ifelse(get(pair[2]) == 0 | is.na(get(pair[2]))
                                  ,0
                                  ,get(pair[1]) / get(pair[2])) ]
  }
  invisible(DT)
}
#' Add difference columns (first - second) for each column pair, by reference.
#'
#' @param DT data.table; modified in place via `:=`.
#' @param col_pair list of length-2 character vectors: c(minuend, subtrahend).
#' @return DT, invisibly.
fn_diff <- function(DT, col_pair) {
  for (pair in col_pair) {
    new_col_name <- paste0(paste0(unlist(pair), collapse = "_"), "_diff")
    # Fix: removed leftover debug print() of the second column name.
    DT[, (new_col_name) := get(pair[1]) - get(pair[2]) ]
  }
  invisible(DT)
}
str(DT)
ST <- Sys.time()  # wall-clock start for the aggregation pass

# Per-member aggregates: every grouping key set is (flag, memberid, <dimension>).
member_dims <- c(
  "channel_code", "main_product_code", "persontravellingid",
  "resort_region_code", "resort_type_code", "room_type_booked_code",
  "season_holidayed_code", "state_code_residence", "state_code_resort",
  "booking_type_code", "cluster_code", "reservationstatusid_code",
  "resort_id", "checkin_date_year", "checkin_date_month",
  "checkin_date_week_of_year", "checkin_date_bom", "checkin_date_eom",
  "checkin_date_dow", "checkin_date_dow_type", "booking_date_year",
  "booking_date_month", "booking_date_week_of_year", "booking_date_bom",
  "booking_date_eom", "booking_date_dow", "booking_date_dow_type",
  "no_of_rooms"
)
member_list <- lapply(member_dims, function(d) c("flag", "memberid", d))

fn_agg(DT, member_list, c("sum", "mean", "median"), "roomnights")
fn_agg(DT, member_list, c("sum", "mean", "median"), "length_of_stay")
fn_agg(DT, member_list, c("mean", "median"), "lead_time")
fn_agg(DT, member_list, c("length"), "reservation_id")  # length == group row count
# Per-resort aggregates: every grouping key set is (flag, resort_id, <dimension>).
# Deliberately excluded (previously commented out): resort_region_code,
# resort_type_code, room_type_booked_code, cluster_code, memberid.
resort_dims <- c(
  "channel_code", "main_product_code", "persontravellingid",
  "season_holidayed_code", "state_code_residence", "state_code_resort",
  "booking_type_code", "reservationstatusid_code",
  "checkin_date_year", "checkin_date_month", "checkin_date_week_of_year",
  "checkin_date_bom", "checkin_date_eom", "checkin_date_dow",
  "checkin_date_dow_type", "booking_date_year", "booking_date_month",
  "booking_date_week_of_year", "booking_date_bom", "booking_date_eom",
  "booking_date_dow", "booking_date_dow_type", "no_of_rooms"
)
resort_list <- lapply(resort_dims, function(d) c("flag", "resort_id", d))

fn_agg(DT, resort_list, c("sum", "mean", "median"), "roomnights")
fn_agg(DT, resort_list, c("mean", "median"), "length_of_stay")
fn_agg(DT, resort_list, c("mean", "median"), "lead_time")
fn_agg(DT, resort_list, c("length"), "reservation_id")

ST          # start time ...
Sys.time()  # ... vs end time; the whole pass took roughly 35 minutes
####### Aggregates ###
############## Group by Number of bookings per year ############
# Booking count per (flag, <dimension>) group, written back by reference.
# `flag` is part of the grouping key, so train and test rows never mix.
count_dims <- c("main_product_code",
                "room_type_booked_code", "resort_type_code", "cluster_code",
                "resort_id",
                "checkin_date_year", "checkin_date_month", "checkin_date_dow",
                "state_code_residence", "persontravellingid",
                "member_age_buckets", "season_holidayed_code")
for (dim_col in count_dims) {
  grp_cols <- c("flag", dim_col)
  for (split in c("train", "test")) {
    DT[flag == split,
       paste0(dim_col, "_groupby") := length(reservation_id),
       by = grp_cols]
  }
}
########### Total Pax outlier exploration ########
# Two candidate definitions of the pax discrepancy per booking.
DT[, tot_pax_var1 := total_pax - (numberofadults + numberofchildren)]
DT[, tot_pax_var2 := total_pax - numberofadults]

pax_dims <- c("channel_code", "booking_type_code", "main_product_code",
              "room_type_booked_code", "resort_type_code", "cluster_code",
              "resort_id", "reservationstatusid_code",
              "persontravellingid",
              "checkin_date_year", "checkin_date_month", "checkin_date_dow",
              "member_age_buckets", "season_holidayed_code")
# Frequency of each (dimension, pax-discrepancy) combination, per split.
for (pax_var in c("tot_pax_var1", "tot_pax_var2")) {
  for (dim_col in pax_dims) {
    grp_cols <- c("flag", dim_col, pax_var)
    for (split in c("train", "test")) {
      DT[flag == split,
         paste0(dim_col, "_", pax_var) := length(reservation_id),
         by = grp_cols]
    }
  }
}
############## Top important feature changed to factor!!!! ############
############# Length of Stay interactions ##################
#DT$length_of_stay_new <- DT$length_of_stay
# Collapse rare length_of_stay categories (<0.1% share) in place.
group_category(data = DT, feature = "length_of_stay", threshold = 0.001,update = TRUE)

los_dims <- c("booking_type_code", "main_product_code",
              "room_type_booked_code", "resort_type_code", "cluster_code",
              "resort_id",
              "checkin_date_year", "checkin_date_month", "checkin_date_dow",
              "state_code_residence", "persontravellingid",
              "member_age_buckets", "season_holidayed_code")
# Booking count per (dimension, length_of_stay) pair, per split.
for (dim_col in los_dims) {
  grp_cols <- c("flag", dim_col, "length_of_stay")
  for (split in c("train", "test")) {
    DT[flag == split,
       paste0(dim_col, "_length_of_stay") := length(reservation_id),
       by = grp_cols]
  }
}
############## Second important feature changed to factor!!!! ############
############# No of rooms interactions ##################
# Collapse rare no_of_rooms categories (<0.1% share) in place.
group_category(data = DT, feature = "no_of_rooms", threshold = 0.001, update = TRUE)

rooms_dims <- c("booking_type_code", "main_product_code",
                "room_type_booked_code", "resort_type_code", "cluster_code",
                "resort_id",
                "checkin_date_year", "checkin_date_month", "checkin_date_dow",
                "state_code_residence", "persontravellingid",
                "member_age_buckets", "season_holidayed_code")
# Booking count per (dimension, no_of_rooms) pair, per split.
for (dim_col in rooms_dims) {
  grp_cols <- c("flag", dim_col, "no_of_rooms")
  for (split in c("train", "test")) {
    DT[flag == split,
       paste0(dim_col, "_no_of_rooms") := length(reservation_id),
       by = grp_cols]
  }
}